# Warm-up: each print() call writes its argument to the console; the `##`
# lines are the console output captured when the lesson was knitted.
print("hehehehe")
## [1] "hehehehe"
print("hahahhaa")
## [1] "hahahhaa"
print("hihihihi")
## [1] "hihihihi"
# Comment here
# print("hahahaha")

# source("W0_install_pkgs.R")

# library() loads a package that has already been installed.
# options() sets parameter values that apply to the whole script (globally).
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Global option: keep strings as character vectors instead of factors.
# Spelled FALSE rather than F because F is an ordinary, reassignable variable.
options(stringsAsFactors = FALSE)

# Three ways a value reaches the console: explicit print(), auto-printing of
# a bare expression, and message() (which writes to stderr, hence no "[1]").
print("hehehehe")
## [1] "hehehehe"
"yoyoyoyo"
## [1] "yoyoyoyo"
message("hahahaha")
## hahahaha
# The `<-` symbol assigns the object on its right to the variable on its
# left. a, b and d are all variable names.
# c() builds a vector, the most basic data type in R; think of it as one
# vertical column in a Google Sheet.
a <- c(1, 2, 3, 4, 5)
b <- c(3, 4, 5, 6, 7)
# `+` is vectorized: elements are added position by position.
d <- a + b
d
## [1] 4 6 8 10 12
# Read the set-1 users CSV with read_csv(). read_csv() is provided by readr
# (attached as part of the tidyverse), not by tidyr. The column types it
# guessed are echoed in the log below.
users_1 <- read_csv("data/china_082019_1_users_csv_hashed.csv")
## Parsed with column specification:
## cols(
## userid = col_character(),
## user_display_name = col_character(),
## user_screen_name = col_character(),
## user_reported_location = col_character(),
## user_profile_description = col_character(),
## user_profile_url = col_character(),
## follower_count = col_double(),
## following_count = col_double(),
## account_creation_date = col_date(format = ""),
## account_language = col_character()
## )
# Practice: load the remaining files.
# - Use read_csv() to read the set-1 tweets data and assign it to a new
#   variable tweets_1.
# - Use read_csv() to read the set-2 users data and assign it to a new
#   variable users_1.
# - Use read_csv() to read the set-2 tweets data and assign it to a new
#   variable tweets_2.
# NOTE(review): users_1 already holds the set-1 users loaded earlier, so the
# set-2 users target is presumably meant to be users_2 — confirm.
# tweets_1 <-
# users_1 <-
# tweets_2 <-
# source("sol/2_1p_sol.R")

# View(): inspect the data in an RStudio viewer window.
# head(): extract the first six records (i.e. the first six rows) to look at.
# View(users_1)
# users_1 %>% View
# View(head(users_1))
# users_1 %>% head %>% View
users_1 %>% head

# class(): shows the data type(s) of a variable. Data read in with the
# tidyverse readers (read_csv() here) comes back as a data frame, which is
# what this course calls a data table.
# dim(): returns the dimensions of the data. This course mostly pictures
# "data" as a rows x columns table (a data frame): each row is one record
# (an observation) and each column is one variable. dim() reports
# observations x variables.
class(users_1)
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
# "spec_tbl_df" "tbl_df" "tbl" "data.frame"
users_1 %>% class
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
dim(users_1)
## [1] 744 10
users_1 %>% dim
## [1] 744 10
# glimpse(): overview of the variables, their types, and sample values.
# summary(): per-variable summary of the data frame, including each
# variable's mean, quartiles, maximum, etc.
glimpse(users_1)
## Observations: 744
## Variables: 10
## $ userid <chr> "vMm2zemFOF7kmXoDyX24Bo+TorqhNutpZlATYy…
## $ user_display_name <chr> "vMm2zemFOF7kmXoDyX24Bo+TorqhNutpZlATYy…
## $ user_screen_name <chr> "vMm2zemFOF7kmXoDyX24Bo+TorqhNutpZlATYy…
## $ user_reported_location <chr> NA, NA, NA, NA, NA, "California, USA", …
## $ user_profile_description <chr> NA, NA, NA, NA, NA, "cool", NA, NA, NA,…
## $ user_profile_url <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ follower_count <dbl> 1, 0, 23949, 17, 0, 17116, 0, 1, 2, 12,…
## $ following_count <dbl> 52, 0, 52, 34, 0, 76, 39, 46, 127, 64, …
## $ account_creation_date <date> 2017-08-30, 2017-10-16, 2016-06-27, 20…
## $ account_language <chr> "zh-cn", "zh-cn", "zh-cn", "es", "zh-tw…
# Pipe form of glimpse(users_1): `%>%` passes users_1 as the first argument,
# so the listing printed below is the same.
users_1 %>% glimpse()## Observations: 744
## Variables: 10
## $ userid <chr> "vMm2zemFOF7kmXoDyX24Bo+TorqhNutpZlATYy…
## $ user_display_name <chr> "vMm2zemFOF7kmXoDyX24Bo+TorqhNutpZlATYy…
## $ user_screen_name <chr> "vMm2zemFOF7kmXoDyX24Bo+TorqhNutpZlATYy…
## $ user_reported_location <chr> NA, NA, NA, NA, NA, "California, USA", …
## $ user_profile_description <chr> NA, NA, NA, NA, NA, "cool", NA, NA, NA,…
## $ user_profile_url <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ follower_count <dbl> 1, 0, 23949, 17, 0, 17116, 0, 1, 2, 12,…
## $ following_count <dbl> 52, 0, 52, 34, 0, 76, 39, 46, 127, 64, …
## $ account_creation_date <date> 2017-08-30, 2017-10-16, 2016-06-27, 20…
## $ account_language <chr> "zh-cn", "zh-cn", "zh-cn", "es", "zh-tw…
# str(): compact structure listing — one line per column with its type and
# leading values, followed by the "spec" attribute that read_csv() attached.
str(users_1)## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 744 obs. of 10 variables:
## $ userid : chr "vMm2zemFOF7kmXoDyX24Bo+TorqhNutpZlATYyxsE=" "919755217121316864" "747292706536226816" "q2SMGvHasu+nugbpNMDCjr2qlZp3FCiGYDLht+gW5pw=" ...
## $ user_display_name : chr "vMm2zemFOF7kmXoDyX24Bo+TorqhNutpZlATYyxsE=" "ailaiyi5" "牛小牛" "q2SMGvHasu+nugbpNMDCjr2qlZp3FCiGYDLht+gW5pw=" ...
## $ user_screen_name : chr "vMm2zemFOF7kmXoDyX24Bo+TorqhNutpZlATYyxsE=" "wuming11xia" "gurevadona88" "q2SMGvHasu+nugbpNMDCjr2qlZp3FCiGYDLht+gW5pw=" ...
## $ user_reported_location : chr NA NA NA NA ...
## $ user_profile_description: chr NA NA NA NA ...
## $ user_profile_url : chr NA NA NA NA ...
## $ follower_count : num 1 0 23949 17 0 ...
## $ following_count : num 52 0 52 34 0 76 39 46 127 64 ...
## $ account_creation_date : Date, format: "2017-08-30" "2017-10-16" ...
## $ account_language : chr "zh-cn" "zh-cn" "zh-cn" "es" ...
## - attr(*, "spec")=
## .. cols(
## .. userid = col_character(),
## .. user_display_name = col_character(),
## .. user_screen_name = col_character(),
## .. user_reported_location = col_character(),
## .. user_profile_description = col_character(),
## .. user_profile_url = col_character(),
## .. follower_count = col_double(),
## .. following_count = col_double(),
## .. account_creation_date = col_date(format = ""),
## .. account_language = col_character()
## .. )
# Pipe form of str(users_1); `%>%` accepts a bare function name without
# parentheses, and the output is the same structure listing.
users_1 %>% str## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 744 obs. of 10 variables:
## $ userid : chr "vMm2zemFOF7kmXoDyX24Bo+TorqhNutpZlATYyxsE=" "919755217121316864" "747292706536226816" "q2SMGvHasu+nugbpNMDCjr2qlZp3FCiGYDLht+gW5pw=" ...
## $ user_display_name : chr "vMm2zemFOF7kmXoDyX24Bo+TorqhNutpZlATYyxsE=" "ailaiyi5" "牛小牛" "q2SMGvHasu+nugbpNMDCjr2qlZp3FCiGYDLht+gW5pw=" ...
## $ user_screen_name : chr "vMm2zemFOF7kmXoDyX24Bo+TorqhNutpZlATYyxsE=" "wuming11xia" "gurevadona88" "q2SMGvHasu+nugbpNMDCjr2qlZp3FCiGYDLht+gW5pw=" ...
## $ user_reported_location : chr NA NA NA NA ...
## $ user_profile_description: chr NA NA NA NA ...
## $ user_profile_url : chr NA NA NA NA ...
## $ follower_count : num 1 0 23949 17 0 ...
## $ following_count : num 52 0 52 34 0 76 39 46 127 64 ...
## $ account_creation_date : Date, format: "2017-08-30" "2017-10-16" ...
## $ account_language : chr "zh-cn" "zh-cn" "zh-cn" "es" ...
## - attr(*, "spec")=
## .. cols(
## .. userid = col_character(),
## .. user_display_name = col_character(),
## .. user_screen_name = col_character(),
## .. user_reported_location = col_character(),
## .. user_profile_description = col_character(),
## .. user_profile_url = col_character(),
## .. follower_count = col_double(),
## .. following_count = col_double(),
## .. account_creation_date = col_date(format = ""),
## .. account_language = col_character()
## .. )
# summary(): per-column digest. As the output below shows, character columns
# report Length/Class/Mode, while numeric and date columns report the
# minimum, quartiles, median, mean and maximum.
# summary(users_1)
users_1 %>% summary## userid user_display_name user_screen_name
## Length:744 Length:744 Length:744
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## user_reported_location user_profile_description user_profile_url
## Length:744 Length:744 Length:744
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## follower_count following_count account_creation_date
## Min. : 0 Min. : 0.00 Min. :2008-05-16
## 1st Qu.: 0 1st Qu.: 7.75 1st Qu.:2016-06-27
## Median : 2 Median : 36.00 Median :2017-08-30
## Mean : 7991 Mean : 1247.38 Mean :2016-08-22
## 3rd Qu.: 12328 3rd Qu.: 74.00 3rd Qu.:2017-10-17
## Max. :170155 Max. :31752.00 Max. :2019-05-07
## account_language
## Length:744
## Class :character
## Mode :character
##
##
##
# Use `$` to pull a single variable (column) out of the users_1 data frame
# and inspect it; the result is a plain vector with one value per row.
users_1$follower_count
## [1] 1 0 23949 17 0 17116 0 1 2 12
## [11] 0 0 23091 1 0 47 29208 0 10 21148
## [21] 12607 17816 0 12919 13986 3 13105 21 0 18
## [31] 58880 0 0 0 1 11786 51608 1 10985 0
## [41] 2 50 0 0 15 2 0 0 0 45860
## [51] 0 64519 2 0 118 0 12512 0 2 0
## [61] 16835 1 0 0 0 45976 1 4 13268 0
## [71] 12033 20133 1 0 3 0 1 1 13349 1
## [81] 5 25315 18900 0 16779 0 2 3 49342 1
## [91] 0 2 0 0 0 24461 10 1 24712 11594
## [101] 0 0 12303 0 21 0 1 0 4 2
## [111] 0 0 0 40213 58 15033 0 0 17808 7
## [121] 2 0 19810 184 12401 13468 650 0 0 0
## [131] 0 3 41065 11817 11769 23786 11659 0 12814 0
## [141] 12 0 41547 1 63428 41522 12695 9954 17800 0
## [151] 65063 48 0 1 16820 0 0 5 21649 1
## [161] 0 14038 0 0 1 3 1 2 12558 1
## [171] 15 0 12698 0 0 43658 15873 52590 14 0
## [181] 0 0 1 0 0 28302 13484 0 10 14340
## [191] 0 11364 0 2 0 0 67522 149 1 0
## [201] 1 45610 49564 58 0 1 0 11371 2 0
## [211] 0 0 0 12651 0 0 2 0 11569 2
## [221] 0 1 0 0 0 1 0 0 1 0
## [231] 0 0 2 0 0 0 32406 24 46733 0
## [241] 1 16252 0 209 29089 0 14904 12633 11461 0
## [251] 11374 11448 1 14957 21820 12801 0 105215 1 0
## [261] 11 30304 16 2 0 1 153 3 56486 0
## [271] 12704 0 0 1 0 0 0 0 12315 0
## [281] 0 0 0 2 6 0 10815 26533 28898 0
## [291] 2 10 33193 0 61815 31 1 3 0 23181
## [301] 0 2 0 12613 0 24 0 4 15895 15013
## [311] 77 0 25612 0 20 1 4 0 0 1
## [321] 0 46799 15 173 0 19503 3 0 7 0
## [331] 84772 1 14575 12520 0 0 12280 1 0 1
## [341] 0 75 1 1 7 0 0 12697 16660 0
## [351] 0 2 1 0 0 6 17562 39047 37 0
## [361] 25627 3 3 12755 0 2 14193 17640 0 41844
## [371] 1 477 0 1 0 0 28072 0 57103 1
## [381] 0 53238 0 7 5 6 2 63602 0 3
## [391] 19146 12104 0 0 0 21333 55013 2 13915 0
## [401] 0 2 11852 2 1 5 0 5 0 0
## [411] 21 0 20 0 3 12717 0 0 64398 0
## [421] 0 17495 1 105754 0 15783 66675 21127 22299 170155
## [431] 2 0 12059 0 14627 24169 1 0 38 16609
## [441] 9253 26091 0 1 11144 10895 12240 5 12765 0
## [451] 1 0 1 28156 22 9971 0 17950 3 0
## [461] 7 4 6 14694 288 2 1 1 7 1
## [471] 1 3 44 0 12941 25714 0 69659 0 0
## [481] 0 0 6 1 24071 28405 52097 2 51666 0
## [491] 0 0 0 23737 3 20414 0 1 157 17861
## [501] 22548 0 145 0 13468 1564 1 9 48 52136
## [511] 24442 17 76718 0 0 0 0 15398 2 2
## [521] 0 29549 0 0 0 2 17694 24058 0 0
## [531] 23989 93490 7 27352 46017 30080 8 1 4 44
## [541] 0 15749 76750 0 7 5 0 0 307 0
## [551] 11423 17746 0 0 31090 18 0 0 0 0
## [561] 0 1 0 149 0 0 0 1 109 5
## [571] 11 0 1 3 0 17 4 0 6 13
## [581] 39316 43358 0 0 4 13449 2 0 4 0
## [591] 14130 17434 1 0 0 1 22861 0 13344 0
## [601] 0 0 0 0 0 0 1 1 12608 1
## [611] 0 13277 1 165 16837 0 0 1 0 3
## [621] 1 0 1 12061 7 1 0 15757 0 12979
## [631] 0 130077 1 42042 1 3 1 0 18662 0
## [641] 17076 0 2 311 13235 11625 0 0 0 36
## [651] 176 0 0 1 16489 0 0 8 0 2
## [661] 6 0 2 1 0 21081 0 11401 14708 0
## [671] 22873 0 0 0 2 0 100847 21024 30554 3
## [681] 0 4 0 1 12659 0 24205 0 1 0
## [691] 0 16366 0 0 0 0 0 0 0 1
## [701] 1 6 2 1 12365 1 2 0 0 3
## [711] 15 14843 0 1 17 13553 13402 5 127 1
## [721] 2 13138 272 23 11232 12648 12757 1 1 0
## [731] 14558 57652 2 1 6 18374 1 50481 69841 2
## [741] 75 18781 0 22551
# class() of an extracted column: follower_count is a numeric vector.
users_1$follower_count %>% class## [1] "numeric"
# First six screen names. Some values are hashed strings rather than
# readable handles.
users_1$user_screen_name %>% head## [1] "vMm2zemFOF7kmXoDyX24Bo+TorqhNutpZlATYyxsE="
## [2] "wuming11xia"
## [3] "gurevadona88"
## [4] "q2SMGvHasu+nugbpNMDCjr2qlZp3FCiGYDLht+gW5pw="
## [5] "lishuishi"
## [6] "baillopud5"
# user_screen_name is a character vector.
users_1$user_screen_name %>% class## [1] "character"
# Practice questions:
# - Which variable in the tweets_1 table should hold the same content as
#   which variable in users_1?
# - In users_1, what are the data types of the following variables:
#   account_creation_date, is_retweet, tweet_time?
#   NOTE(review): is_retweet and tweet_time appear in the tweets_1 column
#   specification, not in users_1 — the question presumably covers both
#   tables; confirm.
# YOUR CODE HERE

# install.packages("skimr")
library(skimr)
##
## Attaching package: 'skimr'
## The following object is masked from 'package:stats':
##
## filter
users_1 %>% skim()

# Three data-frame verbs introduced in this section:
# select(): pick the columns (col) you need, i.e. the variables.
# filter(): keep the rows (row) that meet a condition, i.e. the records /
#           observations.
# mutate(): create a new variable in the data frame from other variables
#           (for example by adding them).

# select(): pick the columns (col) you need, i.e. the variables.
# selected <- select(users, userid, user_display_name, user_screen_name)
selected <- users_1 %>%
  select(userid, user_display_name, user_screen_name)

# filter(): keep the rows (row) that meet a condition, i.e. the records /
# observations. Here: screen names longer than 20 characters.
users_1 %>%
  filter(nchar(user_screen_name) > 20)

# mutate(): create a new variable in the data frame from other variables
# (for example by adding them).
# NOTE(review): the log below reports 951646 parsing failures because the
# guessed column types do not fit every row (e.g. hashed userid values read
# as double); passing an explicit `col_types` would avoid them — confirm
# whether the lesson wants that.
tweets_1 <- read_csv("data/china_082019_1_tweets_csv_hashed.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## tweetid = col_double(),
## userid = col_double(),
## user_profile_url = col_logical(),
## follower_count = col_double(),
## following_count = col_double(),
## account_creation_date = col_date(format = ""),
## tweet_time = col_datetime(format = ""),
## in_reply_to_userid = col_double(),
## in_reply_to_tweetid = col_double(),
## quoted_tweet_tweetid = col_double(),
## is_retweet = col_logical(),
## retweet_userid = col_logical(),
## retweet_tweetid = col_logical(),
## quote_count = col_double(),
## reply_count = col_double(),
## like_count = col_double(),
## retweet_count = col_double(),
## poll_choices = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 951646 parsing failures.
## row col expected actual file
## 1003 retweet_userid 1/0/T/F/TRUE/FALSE 2896172507 'data/china_082019_1_tweets_csv_hashed.csv'
## 1003 retweet_tweetid 1/0/T/F/TRUE/FALSE 581033920591859712 'data/china_082019_1_tweets_csv_hashed.csv'
## 1052 userid a double nXJwGmcG71Ho3srqnXTSnyEokZXke8tkdISrrycI= 'data/china_082019_1_tweets_csv_hashed.csv'
## 1052 user_profile_url 1/0/T/F/TRUE/FALSE http://t.co/x5Ctw87c 'data/china_082019_1_tweets_csv_hashed.csv'
## 1076 retweet_userid 1/0/T/F/TRUE/FALSE 2896172507 'data/china_082019_1_tweets_csv_hashed.csv'
## .... ................ .................. ......................................... ...........................................
## See problems(...) for more details.
# Add ff_perc (followers divided by followings) as a new column, then keep
# just the columns needed to check the result and show the first rows.
users_1 %>%
  mutate(ff_perc = follower_count / following_count) %>%
  select(ff_perc, follower_count, following_count, user_screen_name) %>%
  head

# Checking tweetid's variable type in data frame tweets_1
tweets_1$tweetid %>% class
## [1] "numeric"
# Converting tweetid's data type from numeric to character by as.character()
# NOTE(review): ids in this file have 18 digits (see the parsing log), which
# is more than a double can hold exactly, so converting after the read
# probably cannot restore the original ids — consider reading the column as
# character instead; confirm.
tweets_1 %>%
  mutate(tweetid = as.character(tweetid))

# After applying filter(), select() or mutate(), the contents of the
# original data frame are still unchanged; the result must be assigned back
# over the original data frame to actually change it.
# Not assigning to replace original dataframe
# The result is only printed; users_1 itself keeps its old columns.
users_1 %>%
  mutate(ff_perc = follower_count / following_count)

# Assigning to replace original dataframe
users_1 <- users_1 %>%
  mutate(ff_perc = follower_count / following_count)
# Assigning to a NEW dataframe
test_1 <- users_1 %>%
  mutate(ff_perc = follower_count / following_count)

# Practice:
# - Load tweets_1 and check the data types of its tweetid and userid
#   variables. (NOTE(review): the leading verb was lost from the source
#   text; "load" is inferred from the commented read_csv() line below.)
# - Is the data type of userid in users_1 the same as the data type of
#   userid in tweets_1 above?
# - Convert the userid and tweetid variables in tweets_1 from numeric to
#   character.
# tweets_1 <- read_csv("data/china_082019_1_tweets_csv_hashed.csv")

# counting account language
# One row per distinct account_language with its frequency in column n.
users_1 %>%
  count(account_language)

# counting account language in descending order
# (TRUE is spelled out: T is an ordinary variable that can be reassigned.)
users_1 %>%
  count(account_language, sort = TRUE)